setwd('~/Dropbox/Abortion/Pre_Roe_Stuff/Paper Files/JLC submission/Replication files')

#TO INSTALL MRP PACKAGE FROM GITHUB, RUN THESE COMMANDS
#library(devtools) ; install_github("mrp", "malecki", sub="mrpdata")
#library(devtools) ; install_github("mrp", "malecki", sub="mrp")

rm(list=ls(all=TRUE))

#load packages
library(boot)
library(arm)
library(foreign)
library(car)
library(blme)
library(plyr)
library(mrp)
library(mrpdata)
#loads some data I might use
# data(mrp.regions) # regions data.frame with DC separate
# data(spmap.states) # projected US state map
# # data(mrp.census)   # census with common demo strata

census.1970 <- read.dta("census uncompressed 1970.dta", convert.underscore=T)

#age is a factor -- fix here so can drop less than 18 year olds
census.1970$age.temp <- as.numeric(
	ifelse(census.1970$age=="Less than 1 year old", 1,
	ifelse(census.1970$age=="90 (90+ in 1980 and 1990)", 90,
	ifelse(census.1970$age=="100 (100+ in 1970))", 100,
	ifelse(census.1970$age=="112 (112+ in the 1980 internal data)", 112,
	ifelse(census.1970$age=="115 (115+ in the 1990 internal data)", 115, census.1970$age)))) ))-1#for some reason it is off by one [checked against Stata]
	
#race 
census.1970$black <- ifelse(census.1970$racesingd == "Black",1,0)
census.1970$hispanic <- ifelse(census.1970$hispand == "Not Hispanic",0,1)
census.1970$crace.wb <- ifelse(census.1970$black==1, 2,1)
census.1970$crace.wbh <- ifelse(census.1970$black==1,2, ifelse(census.1970$hispanic ==1, 3,1))

#subset by age and citizenship
census.1970.cut <- subset(census.1970, age.temp>=18 & citizen!="Not a citizen" )

#create categorical variables
census.1970.cut$cedu.cat <- ifelse(
	census.1970.cut$educrec == "None or preschool" | census.1970.cut$educrec=="Grade 1, 2, 3, or 4" | census.1970.cut$educrec=="Grade 5, 6, 7, or 8",1,
	ifelse(census.1970.cut$educrec == "Grade 9" | 	census.1970.cut$educrec == "Grade 10" | census.1970.cut$educrec == "Grade 11", 2,
	ifelse(census.1970.cut$educrec == "Grade 12", 3,
	ifelse(census.1970.cut$educrec == "1 to 3 years of college" | census.1970.cut$educrec == "4+ years of college",
	4, NA))))
census.1970.cut$crace.wb <- ifelse(census.1970.cut$racesingd == "Black", 2,1)
census.1970.cut$cfemale <- ifelse(census.1970.cut$sex=="Female", 1,0)

census.1970.cut$cage.cat	<- ifelse(census.1970.cut$age.temp >= 18 & census.1970.cut$age.temp <=29, 1,
		ifelse(census.1970.cut$age.temp >= 30 & census.1970.cut$age.temp <=44, 2,
		ifelse(census.1970.cut$age.temp >= 45 & census.1970.cut$age.temp <=64, 3,
		ifelse(census.1970.cut$age.temp >=65, 4,NA))))

#make poststrat--just using Race WB for now b/c polls do not ask Hispanic
poststrat.1970 <- 
	as.data.frame(ftable(census.1970.cut[c("statefip","cedu.cat", "cfemale", "cage.cat", "crace.wb")]))

#merge state abbreviations
temp <- data.frame(state.name,state.abb)
temp$statefip <- temp$state.name
temp.row <- data.frame("District of Columbia", "DC", "District of Columbia")
colnames(temp.row) <- colnames(temp)
temp <- rbind(temp, temp.row)

dim(poststrat.1970)
poststrat.1970 <- merge(poststrat.1970, temp, add.x=T)
dim(poststrat.1970)
poststrat.1970$cstate <- poststrat.1970$state.abb
poststrat.1970$statefip <- NULL
poststrat.1970$state.abb <- NULL
poststrat.1970$state.name <- NULL
poststrat.1970$cfreq <- poststrat.1970$Freq
poststrat.1970$Freq <- NULL
head(poststrat.1970)

#calculate frequencies by state and percentage of each cell within state
poststrat.1970$cfreq_state <- tapply(poststrat.1970$cfreq, poststrat.1970$cstate, sum)[poststrat.1970$cstate]
poststrat.1970$cpercent_state <- poststrat.1970$cfreq / poststrat.1970$cfreq_state
poststrat.1970 <- poststrat.1970[order(poststrat.1970$cstate,poststrat.1970$crace.wb, poststrat.1970$cage.cat,  poststrat.1970$cedu.cat),]
#add region
poststrat.1970 <- within(poststrat.1970,{
	cregion <- ifelse( cstate =="CT" | cstate =="ME" | cstate =="MA" | cstate =="NH" | cstate =="RI" | cstate =="VT" | cstate =="NJ" | cstate =="NY" | cstate =="PA", "northeast",
	ifelse( cstate =="IN" | cstate =="IL" | cstate =="MI" | cstate =="OH" | cstate =="WI" | 	cstate =="IA" | 	cstate =="KS"  | cstate =="MN"  | cstate =="MO"  | cstate =="NE"  | cstate =="ND"  | cstate =="SD", "midwest",
	ifelse(cstate =="DE"  | cstate =="FL"  | cstate =="GA" | cstate =="MD"  | cstate =="NC"  | 	cstate =="SC"  | cstate =="VA"  | cstate =="WV" | cstate =="AL"  | cstate =="KY" | 	cstate =="MS" | cstate =="TN"  | cstate =="AR"  | cstate =="LA" | cstate =="OK"  | 	cstate =="TX", "south",
	ifelse(cstate =="AZ"  | cstate =="CO"  | cstate =="ID" | cstate =="NM"   | cstate =="MT"  | cstate =="UT"  | cstate =="NV" | cstate =="WY"  | cstate =="AK"  | cstate =="CA" |		cstate =="HI"  | cstate =="OR"  | cstate =="WA", "west",
  ifelse( cstate =="DC", "DC", NA)))))
})
table(poststrat.1970$cstate)
table(poststrat.1970$cregion)

write.dta(poststrat.1970, "1970_poststrat_race_wb.dta")

#now create syntehic poststrat that uses Lucas Leeman's MRP* idea.
#first, read in percent catholic data by state (from statelevel data)
data.cath <- read.dta("statelevel_to_use_in_mrp.dta")
data.cath$cstate <- data.cath$sstate
data.cath <- subset(data.cath, select=c(cstate, cath_adherents_prop_1970))
data.cath$cath.share <- data.cath$cath_adherents_prop_1970
data.cath$cath_adherents_prop_1970 <- NULL

head(poststrat.1970)

poststrat.1970.synthetic <- join(poststrat.1970, data.cath)
dim(poststrat.1970.synthetic)
head(poststrat.1970.synthetic)

#add cath variable
# poststrat.1970.synthetic$ccath0 <- 0
# poststrat.1970.synthetic$ccath1 <- 1
dim(poststrat.1970.synthetic)
head(poststrat.1970.synthetic)

#now replicate 2 times
poststrat.1970.synthetic <- rbind(poststrat.1970.synthetic,poststrat.1970.synthetic)
#poststrat.1970.synthetic <- kronecker(poststrat.1970.synthetic1, c(1,1))

dim(poststrat.1970.synthetic)
head(poststrat.1970.synthetic)

#now order by "state-type"
poststrat.1970.synthetic <- within (poststrat.1970.synthetic,{
 state.type <- paste(cstate,cedu.cat,cfemale,cage.cat,crace.wb)
})
poststrat.1970.synthetic <- poststrat.1970.synthetic[order(poststrat.1970.synthetic$state.type),]
#now create new indicator for catholic cell (so each state-type gets a 1 and 0 in two separate rows)
poststrat.1970.synthetic$ccath <- rep(seq(0,1), nrow(poststrat.1970.synthetic)/2)
head(poststrat.1970.synthetic)

#new frequency
poststrat.1970.synthetic$cpercent_state_new <- ifelse(poststrat.1970.synthetic$ccath==1, 
	poststrat.1970.synthetic$cpercent_state*poststrat.1970.synthetic$cath.share, 
	poststrat.1970.synthetic$cpercent_state*(1-poststrat.1970.synthetic$cath.share))
#confirm it worked
with (poststrat.1970.synthetic,{
	print(tapply(cpercent_state,cstate,sum))
	print(tapply(cpercent_state_new,cstate,sum))
})

#overwrite old
poststrat.1970.synthetic$cpercent_state <- poststrat.1970.synthetic$cpercent_state_new
poststrat.1970.synthetic$cpercent_state_new <- NULL

head(poststrat.1970.synthetic)

write.dta(poststrat.1970.synthetic, "1970_poststrat_race_wb_synthetic.dta")







